Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import random
import math
import time
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
import datetime
import operator

Loading the global deaths dataset

In [2]:
df = pd.read_csv("time_series_covid19_deaths_global.csv")
df.head()
Out[2]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 7/4/20 7/5/20 7/6/20 7/7/20 7/8/20 7/9/20 7/10/20 7/11/20 7/12/20 7/13/20
0 NaN Afghanistan 33.93911 67.709953 0 0 0 0 0 0 ... 826 864 898 920 936 957 971 994 1010 1012
1 NaN Albania 41.15330 20.168300 0 0 0 0 0 0 ... 74 76 79 81 83 83 85 89 93 95
2 NaN Algeria 28.03390 1.659600 0 0 0 0 0 0 ... 946 952 959 968 978 988 996 1004 1011 1018
3 NaN Andorra 42.50630 1.521800 0 0 0 0 0 0 ... 52 52 52 52 52 52 52 52 52 52
4 NaN Angola -11.20270 17.873900 0 0 0 0 0 0 ... 19 19 19 21 21 22 23 23 26 26

5 rows × 178 columns

In [3]:
df1 = df.copy()
df1.drop(["Province/State","Lat","Long"],inplace=True,axis=1)
In [4]:
df1 = df1.melt(id_vars=["Country/Region"], var_name="Date", value_name="Value")
In [5]:
fig = px.line(df1, x="Date", y="Value", title='Deaths reported over time in the world',color='Country/Region')
fig.show()

Top 5 Countries with Highest Deaths Reported

In [26]:
df.iloc[:,[1,-1]].groupby("Country/Region").sum().reset_index().sort_values(by='7/13/20',ascending=False)[:5]
Out[26]:
Country/Region 7/13/20
174 US 135566
23 Brazil 72833
178 United Kingdom 44915
113 Mexico 35491
85 Italy 34967
In [27]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'United Kingdom']['Date'], y=df1[df1["Country/Region"] == 'United Kingdom']['Value'],
                    mode='lines',
                    name='United Kingdom'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Italy']['Date'], y=df1[df1["Country/Region"] == 'Italy']['Value'],
                    mode='lines',
                    name='Italy'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Brazil']['Date'], y=df1[df1["Country/Region"] == 'Brazil']['Value'],
                    mode='lines',
                    name='Brazil'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Mexico']['Date'], y=df1[df1["Country/Region"] == 'Mexico']['Value'],
                    mode='lines',
                    name='Mexico'))
fig.update_layout(
    title = "Time Series Analysis of (Date and Deaths Reported) for Countries with Highest Deaths Reported",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Deaths Reported",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()
C:\Users\Saurabh\Anaconda3\lib\site-packages\plotly\graph_objs\_deprecations.py:385: DeprecationWarning:

plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


Top 5 Countries with lowest Number of Deaths Reported`

In [28]:
df.iloc[:,[1,-1]].groupby("Country/Region").sum().reset_index().sort_values(by='7/13/20')[:5]
Out[28]:
Country/Region 7/13/20
132 Papua New Guinea 0
56 Eritrea 0
169 Timor-Leste 0
60 Fiji 0
69 Grenada 0
In [29]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Timor-Leste']['Date'], y=df1[df1["Country/Region"] == 'Timor-Leste']['Value'],
                    mode='lines',
                    name='Timor-Leste'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Papua New Guinea']['Date'], y=df1[df1["Country/Region"] == 'Papua New Guinea']['Value'],
                    mode='lines',
                    name='Papua New Guinea'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Eritrea']['Date'], y=df1[df1["Country/Region"] == 'Eritrea']['Value'],
                    mode='lines',
                    name='Eritrea'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Fiji']['Date'], y=df1[df1["Country/Region"] == 'Fiji']['Value'],
                    mode='lines',
                    name='Fiji'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Grenada']['Date'], y=df1[df1["Country/Region"] == 'Grenada']['Value'],
                    mode='lines',
                    name='Grenada'))
fig.update_layout(
    title = "Time Series Analysis of (Date and Deaths Reported) for Countries with Lowest Deaths Reported",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Deaths Reported",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()
C:\Users\Saurabh\Anaconda3\lib\site-packages\plotly\graph_objs\_deprecations.py:385: DeprecationWarning:

plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


Log of Deaths Reported Over time

In [10]:
fig = px.line(df1, x="Date", y="Value", title='Log of deaths reported over time for all the countries',color='Country/Region')
fig.update_layout(yaxis_type="log",
                 yaxis = dict(title_text = "log(Deaths Reported)"))
fig.show()
In [30]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'United Kingdom']['Date'], y=df1[df1["Country/Region"] == 'United Kingdom']['Value'],
                    mode='lines',
                    name='United Kingdom'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Italy']['Date'], y=df1[df1["Country/Region"] == 'Italy']['Value'],
                    mode='lines',
                    name='Italy'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Brazil']['Date'], y=df1[df1["Country/Region"] == 'Brazil']['Value'],
                    mode='lines',
                    name='Brazil'))
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Mexico']['Date'], y=df1[df1["Country/Region"] == 'Mexico']['Value'],
                    mode='lines',
                    name='Mexico'))
fig.update_layout(
    title = "Log of deaths reported over time for top 5 countries",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Deaths Reported)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")
fig.show()

Country Specific Graphs

Distribution of Confirmed cases in U.S.

In [12]:
f,ax = plt.subplots(figsize=(15,6))
sns.distplot(df1[df1["Country/Region"] == "US"]["Value"])
plt.show()
C:\Users\Saurabh\Anaconda3\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Deaths in U.S.

In [14]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))
fig.update_layout(
    title = "Deaths Reported in U.S. over the time period",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Deaths Reported",
        title_font = {"size": 20},
        title_standoff = 25))

Log of deaths reported in U.S.

In [59]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'US']['Date'], y=df1[df1["Country/Region"] == 'US']['Value'],
                    mode='lines',
                    name='US'))
fig.update_layout(
    title = "Deaths Reported in U.S. over the time period",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Deaths Reported)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")
fig.show()
C:\Users\Saurabh\Anaconda3\lib\site-packages\plotly\graph_objs\_deprecations.py:385: DeprecationWarning:

plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


Deaths Reported in India

In [16]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'India']['Date'], y=df1[df1["Country/Region"] == 'India']['Value'],
                    mode='lines+markers',
                    name='India'))
fig.update_layout(
    title = "Deaths Reported in India over the time period",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Deaths Reported",
        title_font = {"size": 20},
        title_standoff = 25))

Log of Deaths Reported in India

In [20]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'India']['Date'], y=df1[df1["Country/Region"] == 'India']['Value'],
                    mode='lines+markers',
                    name='India'))
fig.update_layout(
    title = "Deaths Reported in India over the time period",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Deaths Reported)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")

Deaths Reported in Spain

In [18]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Spain']['Date'], y=df1[df1["Country/Region"] == 'Spain']['Value'],
                    mode='lines',
                    name='Spain'))
fig.update_layout(
    title = "Deaths Reported in Spain over the time period",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Deaths Reported",
        title_font = {"size": 20},
        title_standoff = 25))

Log of Deaths Reported in Spain

In [19]:
fig = go.Figure()
fig.add_trace(go.Line(x=df1[df1["Country/Region"] == 'Spain']['Date'], y=df1[df1["Country/Region"] == 'Spain']['Value'],
                    mode='lines',
                    name='Spain'))
fig.update_layout(
    title = "Deaths Reported in Spian over the time period",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "log(Deaths Reported)",
        title_font = {"size": 20},
        title_standoff = 25))
fig.update_layout(yaxis_type="log")

Deaths Reported all over the world

In [34]:
country_tot = df.iloc[:,[1,-1]].groupby("Country/Region").sum().reset_index().sort_values(by='7/13/20',ascending=False)[:20]
In [44]:
fig = go.Figure()
fig.add_trace(go.Bar(
    y=country_tot["Country/Region"],
    x= country_tot["7/13/20"],
    orientation='h',
    marker=dict(
        color='rgba(246, 78, 139, 0.6)',
        line=dict(color='rgba(246, 78, 139, 1.0)', width=2)
    )
))
fig.update_layout(
    title = "Deaths Reported all over the world",
    xaxis = dict(
        title_text = "Deaths Reported",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Country",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()

World Daily increase in Death Cases

In [36]:
world_daily = df1.groupby("Date").sum().reset_index()
In [43]:
fig = go.Figure(go.Bar(
            x=world_daily["Date"],
            y=world_daily["Value"],
            orientation='v'))
fig.update_layout(
    title = "World's rise in Deaths Reported",
    xaxis = dict(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25),
    yaxis = dict(
        title_text = "Deaths Reported",
        title_font = {"size": 20},
        title_standoff = 25))

fig.show()

Prediction Part

In [45]:
df.head()
Out[45]:
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 7/4/20 7/5/20 7/6/20 7/7/20 7/8/20 7/9/20 7/10/20 7/11/20 7/12/20 7/13/20
0 NaN Afghanistan 33.93911 67.709953 0 0 0 0 0 0 ... 826 864 898 920 936 957 971 994 1010 1012
1 NaN Albania 41.15330 20.168300 0 0 0 0 0 0 ... 74 76 79 81 83 83 85 89 93 95
2 NaN Algeria 28.03390 1.659600 0 0 0 0 0 0 ... 946 952 959 968 978 988 996 1004 1011 1018
3 NaN Andorra 42.50630 1.521800 0 0 0 0 0 0 ... 52 52 52 52 52 52 52 52 52 52
4 NaN Angola -11.20270 17.873900 0 0 0 0 0 0 ... 19 19 19 21 21 22 23 23 26 26

5 rows × 178 columns

In [46]:
columns = df.keys()
confirmed = df.loc[:, columns[4]:columns[-1]]
In [47]:
dates = confirmed.keys()
world_cases = []

for i in dates:
    confirmed_sum = confirmed[i].sum()
    world_cases.append(confirmed_sum)
In [48]:
days_since_1_22 = np.array([i for i in range(len(dates))]).reshape(-1, 1)
world_cases = np.array(world_cases).reshape(-1, 1)
In [49]:
days_in_future = 15
future_forcast = np.array([i for i in range(len(dates)+days_in_future)]).reshape(-1, 1)
adjusted_dates = future_forcast[:-15]
In [50]:
start = '1/22/2020'
start_date = datetime.datetime.strptime(start, '%m/%d/%Y')
future_forcast_dates = []
for i in range(len(future_forcast)):
    future_forcast_dates.append((start_date + datetime.timedelta(days=i)).strftime('%m/%d/%Y'))
In [51]:
X_train_confirmed, X_test_confirmed, y_train_confirmed, y_test_confirmed = train_test_split(days_since_1_22, world_cases, test_size=0.15, shuffle=False) 

Prediction using Linear Regression

In [52]:
linear_model = LinearRegression(normalize=True, fit_intercept=True)
linear_model.fit(X_train_confirmed, y_train_confirmed)
test_linear_pred = linear_model.predict(X_test_confirmed)
linear_pred = linear_model.predict(future_forcast)
print('MAE:', mean_absolute_error(test_linear_pred, y_test_confirmed))
print('MSE:',mean_squared_error(test_linear_pred, y_test_confirmed))
MAE: 72313.53849460295
MSE: 5334133513.769756
In [53]:
print(linear_model.coef_)
print(linear_model.intercept_)
[[3415.46981843]]
[-106884.15388858]
In [61]:
plt.figure(figsize=(20, 12))
plt.plot(adjusted_dates, world_cases)
plt.plot(future_forcast, linear_pred, linestyle='dashed', color='orange')
plt.title('Number of Covid Cases Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('Number of Deaths Reported', size=30)
plt.legend(['Deaths Reported', 'Linear Regression Predictions'])
plt.xticks(size=15)
plt.show()

Future Prediction using Linear Regression

In [55]:
print('Linear regression future predictions:')
print(linear_pred[-15:])
Linear regression future predictions:
[[487407.59451904]
 [490823.06433747]
 [494238.53415591]
 [497654.00397434]
 [501069.47379278]
 [504484.94361121]
 [507900.41342964]
 [511315.88324808]
 [514731.35306651]
 [518146.82288495]
 [521562.29270338]
 [524977.76252182]
 [528393.23234025]
 [531808.70215869]
 [535224.17197712]]

Prediction using Support Vector Machines

In [56]:
svm_confirmed = SVR(shrinking=True, kernel='poly',gamma=0.01, epsilon=1,degree=3, C=0.1)
svm_confirmed.fit(X_train_confirmed, y_train_confirmed)
svm_pred = svm_confirmed.predict(future_forcast)
C:\Users\Saurabh\Anaconda3\lib\site-packages\sklearn\utils\validation.py:752: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().

In [57]:
# check against testing data
svm_test_pred = svm_confirmed.predict(X_test_confirmed)
plt.figure(figsize=(15,6))
plt.plot(y_test_confirmed)
plt.plot(svm_test_pred)
plt.legend(['Test Data', 'SVM Predictions'])
print('MAE:', mean_absolute_error(svm_test_pred, y_test_confirmed))
print('MSE:',mean_squared_error(svm_test_pred, y_test_confirmed))
MAE: 216999.4721787932
MSE: 51881971007.50892
In [63]:
x = adjusted_dates
y = world_cases
pred = svm_pred
algo_name = 'SVM Predictions'
color = 'purple'
plt.figure(figsize=(15, 8))
plt.plot(x, y)
plt.plot(future_forcast, pred, linestyle='dashed', color=color)
plt.title('Worldwide Coronavirus Cases Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('Number of Deaths Reported', size=30)
plt.legend(['Deaths Reported',algo_name], prop={'size': 20})
plt.xticks(size=20)
plt.yticks(size=20)
plt.show()
In [ ]: